In the examples below, we’ll be working with a random sample of 10,000 public Facebook posts by Members of the U.S. Congress. The overall question that we will be trying to answer is: what types of posts get more likes?
library(DBI)
# Open a connection to the SQLite database file that holds the Facebook data.
# The connection object `db` is what the dbGetQuery() calls below query.
db <- dbConnect(RSQLite::SQLite(), "~/data/facebook-db.sqlite")
# Join every post to the `congress` row with the same screen_name, then keep
# 10,000 rows in random order (ORDER BY RANDOM() + LIMIT 10000).
# Both tables have a `type` column: the one from `posts` is returned as
# `post_type`, the one from `congress` keeps its own name.
df <- dbGetQuery(db,
"SELECT posts.screen_name, date, posts.type AS post_type,
message, likes_count, comments_count, shares_count,
love_count, haha_count, wow_count, angry_count,
sad_count, gender, congress.type, party
FROM posts JOIN congress
ON congress.screen_name = posts.screen_name
ORDER BY RANDOM()
LIMIT 10000")
## Warning in rsqlite_fetch(res@ptr, n = n): Column `screen_name`: mixed type,
## first seen values of type string, coercing other values of type real
# The data is also available as a CSV file. Note that running this line
# replaces the `df` that was just returned by the query above.
df <- read.csv("fb-congress-data.csv", stringsAsFactors=FALSE)
And now we load the ggplot2 package:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Base layer shared by the plots below: number of likes on the x-axis.
p <- ggplot(data = df, mapping = aes(x = likes_count))
# Histogram of likes for each post.
p +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density of likes for each post.
p +
  geom_density()
# Same histogram, with the x-axis transformed to a log10 scale.
p +
  geom_histogram() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
# Same density plot, with the x-axis transformed to a log10 scale.
p +
  geom_density() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22 rows containing non-finite values (stat_density).
# Question: why does this line of code drop some observations?
# Base layer: type of post on the x-axis.
p <- ggplot(data = df, mapping = aes(x = post_type))
# Bar chart of the number of posts by type.
p +
  geom_bar()
# Same bar chart with the coordinates flipped, so the bars are horizontal.
p +
  geom_bar() +
  coord_flip()
# Base layer: number of likes (x) against number of comments (y).
p <- ggplot(df, aes(x = likes_count, y = comments_count))
# Scatter plot: relationship between number of likes and number of comments.
p + geom_point()
# Same scatter plot with both axes on a log10 scale.
p + geom_point() + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
# Scatter plot with a smoothed trend line on top. `na.rm` is written as `TRUE`
# rather than `T`: `T` is an ordinary variable that can be reassigned, whereas
# `TRUE` is a reserved word.
p + geom_point() + stat_smooth(na.rm = TRUE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend line on log10 axes.
p + geom_point() + scale_x_log10() + scale_y_log10() +
  stat_smooth()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 652 rows containing non-finite values (stat_smooth).
Another example, but this time with a line plot:
# Count the rows of `posts` for each distinct value of `date`. The count is
# returned in a column named `post_count`, and rows are sorted by `date`.
counts <- dbGetQuery(db,
"SELECT date, COUNT(1) as post_count
FROM posts
GROUP BY date
ORDER BY date")
# Line plot of posts per day. `date` is passed through as.Date() before being
# mapped to the x-axis.
p <- ggplot(
  data = counts,
  mapping = aes(x = as.Date(date), y = post_count)
)
p +
  geom_line()
# Base layer: type of post (x) against number of likes (y).
p <- ggplot(data = df, mapping = aes(x = post_type, y = likes_count))
# Box plot of the number of likes by type of post.
p +
  geom_boxplot()
# Same box plot with the y-axis on a log10 scale.
p +
  geom_boxplot() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
# Violin plot of the same data, also with a log10 y-axis.
p +
  geom_violin() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 22 rows containing non-finite values (stat_ydensity).
# Same comparison with a density plot: one density curve of likes per party,
# distinguished by line color, on a log10 x-axis.
p <- ggplot(data = df, mapping = aes(x = likes_count))
p +
  geom_density(mapping = aes(color = party)) +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22 rows containing non-finite values (stat_density).
# Count posts for every combination of post type and party. Posts are joined
# to `congress` on screen_name, rows whose party is 'Independent' are left
# out, and the count is returned in a column named `post_count`.
counts <- dbGetQuery(db,
"SELECT posts.type, congress.party,
COUNT(1) AS post_count
FROM posts JOIN congress
ON congress.screen_name = posts.screen_name
WHERE party != 'Independent'
GROUP BY posts.type, congress.party")
# Heat map: party on the x-axis, post type on the y-axis, and the fill of each
# tile mapped to the number of posts.
p <- ggplot(data = counts, mapping = aes(x = party, y = type))
p +
  geom_tile(mapping = aes(fill = post_count))
# Likes against comments, with point color mapped to the natural log of the
# number of "angry" reactions.
# NOTE(review): log() of a zero count is -Inf — confirm how posts with no
# angry reactions are expected to be colored.
p <- ggplot(
  data = df,
  mapping = aes(
    x = likes_count,
    y = comments_count,
    color = log(angry_count)
  )
)
p +
  geom_point()
# Same plot on log10 axes, with a linear-model ("lm") fit drawn on top.
p +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  stat_smooth(method = "lm")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 652 rows containing non-finite values (stat_smooth).
# Grid of plots, one panel per post type, laid out on two rows
# (2x4 in the original output).
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~post_type, nrow = 2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
# Posts with more than 10,000 likes, drawn with geom_text() so that party
# names are used instead of points.
p <- ggplot(
  data = df[df$likes_count > 10000, ],
  mapping = aes(x = likes_count, y = comments_count, label = party)
)
p +
  geom_text() +
  scale_x_log10() +
  scale_y_log10()
Another example, using time and counts by party:
## counting number of posts by party and month: SUBSTR(posts.date, 1, 7) keeps
## the first seven characters of the date (presumably 'YYYY-MM' -- confirm the
## stored date format); posts whose party is 'Independent' are excluded
counts <- dbGetQuery(db,
"SELECT SUBSTR(posts.date, 1, 7) AS month,
congress.party,
COUNT(1) AS post_count
FROM posts JOIN congress
ON congress.screen_name = posts.screen_name
WHERE party != 'Independent'
GROUP BY month, congress.party")
# Line plot of posts per month, by party. '-01' is appended to each month
# string before it is handed to as.Date(), and `group = party` draws one line
# per party.
p <- ggplot(
  data = counts,
  mapping = aes(
    x = as.Date(paste0(month, '-01')),
    y = post_count,
    group = party
  )
)
# NOTE(review): the two colors are unnamed, so they are matched to the party
# values by position — confirm that order is the intended one.
p +
  geom_line(mapping = aes(color = party)) +
  scale_color_manual(values = c("blue", "red"))
Other examples:
# Scatter plot of posts with more than 5,000 likes, with dots colored by type
# of post.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(color = post_type)) +
  scale_x_log10() +
  scale_y_log10()
# Same idea, using the point shape to show the type of post.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(shape = post_type)) +
  scale_x_log10() +
  scale_y_log10()
# Combining both: shape and color are mapped to the type of post.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(shape = post_type, color = post_type)) +
  scale_x_log10() +
  scale_y_log10()
# This can be very easily extended to multiple scales: shape shows gender,
# color shows type of post, and size shows the number of shares (on a log10
# size scale), with one panel per post type on two rows.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(
    mapping = aes(shape = gender, color = post_type, size = shares_count)
  ) +
  scale_x_log10() +
  scale_y_log10() +
  scale_size(trans = "log10") +
  facet_wrap(~post_type, nrow = 2)
## Warning: Transformation introduced infinite values in discrete y-axis
## Warning in sqrt(x): NaNs produced
## Warning: Removed 2 rows containing missing values (geom_point).
# Baseline: plain scatter plot of likes against comments on log10 axes.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
# Jittering points (useful for counts): each point is jittered with
# width = 0.5 and height = 0.5.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_jitter(position = position_jitter(width = 0.5, height = 0.5)) +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
# Transparency: the same jittered points, drawn with alpha = 1/25.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_jitter(
    position = position_jitter(width = 0.5, height = 0.5),
    alpha = 1 / 25
  ) +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
# Hexbin: only posts with more than zero likes and more than zero comments
# are kept. Both axes and the fill scale are log10.
p <- ggplot(
  data = df[df$likes_count > 0 & df$comments_count > 0, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_hex() +
  scale_x_log10() +
  scale_y_log10() +
  scale_fill_continuous(trans = "log10")